import numpy as np
import pandas as pd
import random

from config import *
# Seed the stdlib RNG so any later use of `random` is reproducible.
random.seed(178006)

print('-------- MERGING TO CENSUS UNINSURED DATA -------------')
print()

# Age cut points stored in the CUTS_AGE pickle, coerced to int and padded
# with a lower sentinel (0) and an upper sentinel (999).
age_cuts = [0, *(int(cut) for cut in pd.read_pickle(CUTS_AGE).values), 999]

# ------------- clean 2_chis_uninsured for merge -----------------------------

# Load the CHIS table and keep only the merge keys, the demographic columns,
# and the two quantities used further down the script.
df_chis = pd.read_csv('../../data/2_chis_uninsured.csv')
df_chis = df_chis.loc[:, [*conditionals,
                          *demographics,
                          'hh_size_adjustment',
                          'conditional_prob']]

# ------------- clean UninsuredRateByRAYearandAge for merge ------------------

# Load the census table and overwrite its header with short positional names.
# The file must therefore have exactly these six columns, in this order.
df_census = pd.read_csv(CENSUS_PATH)
df_census.columns = ['year', 'ra', 'age', 'pop', 'pop_unins', 'percent_unins']

# age bins: translate each census age-bin label to the lower bound of its
# range; labels that are not listed here become NaN.
df_census['age_'] = df_census.age.map(
        {'agebin.0to17': 0,
         'agebin.18to24': 18,
         'agebin.25to34': 25,
         'agebin.35to44': 35,
         'agebin.45to54': 45,
         'agebin.55to64': 55,
         'agebin.65andup': 65})

# Recode the lower bounds into the integer bin codes implied by age_cuts.
# Interval i covers (age_cuts[i - 1], age_cuts[i]] and receives code i + 1;
# rows that fall in no interval keep the default code 0.
# NOTE(review): the first interval starts at age_cuts[0] + 1, so with a
# leading cut of 0 the 'agebin.0to17' rows (age_ == 0) never match and stay
# at code 0 -- confirm this exclusion is intended.
df_census['age'] = 0
for i in range(1, len(age_cuts)):
    low = age_cuts[i - 1] + 1
    high = age_cuts[i]
    in_bin = (df_census.age_ >= low) & (df_census.age_ <= high)
    df_census.loc[in_bin, 'age'] = i + 1  # age bins start at 2

# collapse: total uninsured population per conditionals2 group.  Select the
# column *before* summing so no other column is aggregated (and none can
# trip up the sum on pandas versions that no longer drop non-numeric columns).
df_census = (df_census.groupby(conditionals2)['pop_unins']
             .sum()
             .reset_index())

# Keep the three study years; copy so the column added below is written to an
# independent frame rather than a filtered slice (avoids
# SettingWithCopyWarning).
df_census = df_census[df_census.year.isin([2014, 2015, 2016])].copy()

# Flag rating areas listed in urban_ras as metro (1) versus non-metro (0).
df_census['metro'] = np.where(df_census.ra.isin(urban_ras), 1, 0)

# ------------- merge and multiply -------------------------------------------

# Both inputs must be unique on their key columns before they are joined.
assert not df_chis.duplicated(subset=conditionals + demographics).any()
assert not df_census.duplicated(subset=conditionals2).any()

# Left-join the census counts onto the CHIS rows, report how many rows found
# no census match, then discard those rows.
df = df_chis.merge(df_census, how='left', on=conditionals)
print(int(df.pop_unins.isna().sum()), 'missing groups in census data')
df = df[df.pop_unins.notna()]

# adjust ACS individual uninsured rates by family size --> household rates
# For every conditionals2 group, hh_size_expected is the sum over its rows of
# hh_size_adjustment * conditional_prob.  The product column is selected
# before summing so that only it is aggregated; summing the whole frame would
# also try to add up the demographic columns, which is wasted work and can
# fail or produce nonsense for non-numeric columns on pandas >= 2.0.
hh_size = df.assign(
    hh_size_expected=df.hh_size_adjustment * df.conditional_prob)
hh_size = (hh_size.groupby(conditionals2)['hh_size_expected']
           .sum()
           .reset_index())

# merge back to main dataset (the per-row adjustment is no longer needed)
df = df.drop(columns='hh_size_adjustment')
df = df.merge(hh_size, on=conditionals2, how='left')

# multiply by conditional_prob and divide by hh_size_expected to
# estimate group uninsured rates; the census figure is kept alongside as the
# "prior" value.
df['pop_unins_post'] = (df.pop_unins / df.hh_size_expected) * df.conditional_prob
df = df.rename(columns={'pop_unins': 'pop_unins_prior'})

# clean up and output
# Keep the group keys, the demographic columns and the computed quantities.
df_out = df.loc[:, [*conditionals2,
                    *demographics,
                    'conditional_prob',
                    'hh_size_expected',
                    'pop_unins_prior',
                    'pop_unins_post']]

# Each (conditionals2, demographics) combination must appear exactly once.
assert not df_out.duplicated(subset=conditionals2 + demographics).any()

df_out.to_csv('../../data/3_uninsured_rates.csv', index=False)
